import numpy as np
import pandas as pd
## Plotly plotting support
# import plotly.plotly as py
import plotly.offline as py
import plotly.figure_factory as ff
py.init_notebook_mode()
import cufflinks as cf
cf.go_offline() # required to use plotly offline (no account required).
import plotly.graph_objs as go
# --- Toy training set ------------------------------------------------------
# Seed first so the generated CSV is reproducible from run to run.
np.random.seed(42)

# Parameters of the "secret" generating model (unknown in a real problem).
n = 75       # number of records
noise = 3.5  # scale applied to the standard-normal noise term
m = 1.5      # slope of the linear term
b = 10.0     # multiplier of the sin(X) term

# Inputs: n uniform draws mapped onto [-10, 10), in ascending order.
X = np.sort(np.random.rand(n) * 20. - 10.)

# Responses: linear trend + sinusoid + Gaussian noise.
Y = (
    m * X
    + b * np.sin(X)
    + np.random.randn(n) * noise
)

# Overwrite a few observations with hand-picked values (outliers), then
# shift every response up by 20.
Y[[20, 23, 30, 55]] = [7, -18, 20, -18]
Y += 20

data = pd.DataFrame({"X": X, "Y": Y})
del X, Y  # keep only the DataFrame around
data.to_csv("toy_training_data.csv", index=False)
# Scatter trace of the training observations, rendered inline.
raw_data = go.Scatter(
    x=data['X'],
    y=data['Y'],
    mode='markers',
    name="Data",
)
py.iplot([raw_data])
# --- Toy test set ----------------------------------------------------------
# Different seed and size from the training set; m, b and noise are the
# values already assigned earlier in the script.
np.random.seed(37)
n = 50  # number of records

# Inputs: n uniform draws mapped onto [-10, 10), in ascending order.
X = np.sort(np.random.rand(n) * 20. - 10.)

# Responses from the same secret model: linear trend + sinusoid + noise.
Y = (
    m * X
    + b * np.sin(X)
    + np.random.randn(n) * noise
)

# Overwrite a few observations with hand-picked values, then shift all
# responses up by 20.
Y[[10, 23, 1]] = [7, -7, -18]
Y += 20

test_data = pd.DataFrame({"X": X, "Y": Y})
del X, Y  # keep only the DataFrame around
test_data.to_csv("toy_test_data.csv", index=False)
# --- Synthetic ice cream pricing data --------------------------------------
np.random.seed(42)

# Price multiplier applied to the serving weight, per flavor.
flavor_prices = {"Vanilla": 0.75, "Chocolate": 0.8, "Strawberry": 0.5}
# Flat amount added to the price, per topping.
topping_prices = {"Sprinkles": 0.3, "Fruit": 1.0, "Chocolate": 0.5, "None": 0.0}

n = 200
weights = np.random.rand(n) * 4 + 1.  # uniform on [1, 5)
flavors = np.random.choice(list(flavor_prices), n)
toppings = np.random.choice(list(topping_prices), n)

# price = weight * flavor rate + topping amount, rounded to two decimals by
# formatting to text and parsing back to float.
price = np.array([
    f"{w * flavor_prices[f] + topping_prices[t]:.2f}"
    for w, f, t in zip(weights, flavors, toppings)
]).astype('float')

# Note: the stored "mass" is the weight rounded to one decimal, whereas the
# price above was computed from the unrounded weight.
icecream = pd.DataFrame(
    {
        "flavor": flavors,
        "topping": toppings,
        "mass": np.round(weights, 1),
        "price": price,
    },
    columns=["flavor", "topping", "mass", "price"],
)

# First 150 rows go to the training file, the remaining rows to the test file.
icecream.iloc[:150].to_csv("icecream_train.csv", index=False)
icecream.iloc[150:].to_csv("icecream_test.csv", index=False)
icecream.head()
# One-hot encode the categorical columns of the ice cream table.
# (The original referenced `df`, which is never defined in this script.)
d = pd.get_dummies(icecream)
from sklearn.feature_extraction import DictVectorizer

# One-hot encode each categorical column separately: turn the column into a
# list of single-key dicts, fit a vectorizer on it, then transform the same
# records.
flavor_records = icecream[["flavor"]].to_dict(orient='records')
flavor_enc = DictVectorizer()
flavor_enc.fit(flavor_records)
onehot_flavor = flavor_enc.transform(flavor_records)

topping_records = icecream[["topping"]].to_dict(orient='records')
topping_enc = DictVectorizer()
topping_enc.fit(topping_records)
onehot_topping = topping_enc.transform(topping_records)
import plotly.figure_factory as ff
import scipy as sp
import scipy.sparse  # import the submodule explicitly; `import scipy` alone may not load it
from sklearn import linear_model

# The table stores the serving weight in the "mass" column; there is no
# "weight" column, so the original `icecream['weight']` lookups failed.
mass = icecream['mass'].values

# Feature matrix: each flavor indicator scaled by the row's mass (so the
# fitted coefficient is a per-unit-mass rate), followed by the plain topping
# indicators. `@` is an explicit matrix product: diag(mass) @ onehot_flavor
# scales row i of the one-hot matrix by mass[i].
f1 = sp.sparse.spdiags(mass, 0, len(mass), len(mass)) @ onehot_flavor
phi = sp.sparse.hstack((f1, onehot_topping))

# Least-squares fit without an intercept term.
reg = linear_model.LinearRegression(fit_intercept=False)
reg.fit(phi, icecream['price'])
yhat = reg.predict(phi)
np.round(reg.coef_, 2)

# Residuals of the fit on the same data, shown as a distribution plot.
q = yhat - icecream['price']
py.iplot(ff.create_distplot([q], group_labels=["residuals"], bin_size=0.001))

# Scratch checks: row-wise scaling via element-wise multiply (the original
# used `*`, a matrix product with mismatched shapes), the number of rows,
# and the diagonal-matrix formulation used above.
onehot_flavor.multiply(mass[:, np.newaxis])
len(mass)
scipy.sparse.spdiags(mass, 0, len(mass), len(mass)) @ onehot_flavor